library(tidyverse) # for data manipulation
library(DESeq2) # for Differential gene expression analysis
library(ggplot2) # for plotting
library(plotly) # for interactive plots
library(GenomicRanges) # for creating and manipulating genomic ranges
library(mixOmics) # for PCA
assay<- read.csv("assay.csv", row.names = 1) # Importing gene expression assay data
colData <- read.csv("colData.csv") # Importing clinical information
temp<- read.csv("rawRange.csv") # Importing rawRanges as dataframe
rowRanges <- GRanges( # creating rawRanges from dataframge
seqnames = Rle(temp$chr), # Chromosome names
ranges = IRanges(start = temp$start, end =temp$end), # Start and End positions
strand = Rle(temp$strand), # Strand information (if available)
data.frame(Gene = temp$Gene, Length = temp$length)) # Additional columns
print("The count data as Assay: ")
## [1] "The count data as Assay: "
head(assay)
## HK16 HK19 HK24 HK25 HK26 WT16 WT19 WT24 WT25 WT26
## DDX11L1 0 0 0 1 0 0 0 0 0 0
## WASH7P 19 33 50 38 39 25 66 70 43 39
## MIR1302-11 0 0 1 0 0 0 0 1 0 1
## FAM138A 0 0 0 0 0 0 0 0 0 0
## OR4G4P 0 0 0 0 0 0 0 0 0 0
## OR4G11P 0 0 0 0 0 0 0 0 0 0
print("The clinical data data as colData: ")
## [1] "The clinical data data as colData: "
head(colData)
## Sample_Name Sample_Group Sex Stage Batch
## 1 HK16 HK Male HK A
## 2 HK19 HK Female HK A
## 3 HK24 HK Male HK A
## 4 HK25 HK Female HK A
## 5 HK26 HK Male HK A
## 6 WT16 WT Male I-II A
print("The gene coordinates as rawRanges ")
## [1] "The gene coordinates as rawRanges "
head(rowRanges)
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | Gene Length
## <Rle> <IRanges> <Rle> | <character> <integer>
## [1] chr1 11869-12227 + | DDX11L1 1756
## [2] chr1 14363-14829 - | WASH7P 2073
## [3] chr1 29554-30039 + | MIR1302-11 1021
## [4] chr1 34554-35174 - | FAM138A 1219
## [5] chr1 52473-53312 + | OR4G4P 947
## [6] chr1 62948-63887 + | OR4G11P 940
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
nreads <- data.frame(reads = colSums(assay)) %>% # Calculating library size by summing read counts for each sample
rownames_to_column("sample") %>%
mutate(Sample_Group = case_when(grepl("HK", sample) ~ "HK",
grepl("WT", sample) ~ "WT"))
lsize <- ggplot(nreads, aes(x=Sample_Group, y=reads, fill=Sample_Group,label=sample)) +
geom_boxplot() +
geom_jitter(width = 0.25) +
scale_fill_manual(values = c("forest green", "steel blue")) +
theme_minimal() +
ggtitle("Library size")
ggplotly(lsize)
Filtering low-count genes is a common preprocessing step in RNA-Seq data analysis, especially when using tools like DESeq2 for differential expression analysis. Here’s why it’s important:
Genes with low counts typically have high variability due to the limited sampling of sequencing reads. This variability makes it difficult to reliably detect significant differences in expression levels between conditions.
Differential expression analysis involves statistical tests for thousands of genes. Each test adds to the multiple-testing burden, requiring more stringent p-value adjustments.
Low-count genes are often affected by: Sequencing noise: Random fluctuations during library preparation or sequencing. Mapping artifacts: Reads mapped spuriously to regions with minimal biological relevance.
assay.filtered <- assay[rowSums(assay > 0) >= 2 & rowSums(assay) >= 20,]
paste0("Number of genes before filtering: ", nrow(assay))
## [1] "Number of genes before filtering: 379"
paste0("Number of genes after filtering: ", nrow(assay.filtered))
## [1] "Number of genes after filtering: 201"
A Relative Log Expression (RLE) plot is a diagnostic tool commonly used in RNA-Seq data analysis to assess the normalization of a count matrix. It shows the log ratios of gene expression counts relative to their median across samples.
# Add pseudocount and calculate log2-transformed counts
log_counts <- log2(assay.filtered + 1)
# Calculate relative log expression
medians <- apply(log_counts, 1, median) # Median for each gene across samples
rle <- sweep(log_counts, 1, medians, FUN = "-") # Subtract medians (log scale)
# Reshape data for ggplot
rle_long <- reshape2::melt(rle)
colnames(rle_long) <- c("Sample", "Deviation")
rle_long$Sample_group <- rep(c("HK", "WT"), c(1005,1005))
# Plot RLE as a boxplot colored by Sample_Group
ggplot(rle_long, aes(x = Sample, y = Deviation, fill = Sample_group)) +
geom_boxplot(outlier.size = 0.5, outlier.colour = "red") +
scale_fill_manual(values = c("forest green", "steel blue")) +
theme_minimal() +
labs(
title = "RLE Plot before Normalization",
x = "Sample",
y = "Relative Log Expression (Deviation)",
fill = "Sample_group"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity
In DESeq2, the primary data structure for storing RNA-Seq data and performing analyses is the DESeqDataSet (DDS) object. It is specifically designed to handle the raw counts and associated metadata for differential expression analysis.
A DESeqDataSet is an object from the S4 class system in R that holds:
DE.obj <- DESeqDataSetFromMatrix(countData = assay.filtered,
colData = colData,
design = ~ Sample_Group)
DESeq2 performs normalization using ‘size factor’ to account for differences in sequencing depth and other systematic biases across samples in RNA-Seq datasets.
The size factor is a scaling factor used in RNA-Seq analysis to normalize read counts across samples, accounting for differences in sequencing depth or library size. This ensures that comparisons of gene expression levels are not biased by variations in the total number of reads per sample.
DE.obj <- estimateSizeFactors(DE.obj) # Estimating size factor
DE.normalized <- counts(DE.obj, normalized = T) # Normalizing counts
# Add pseudocount and calculate log2-transformed counts
log_counts <- log2(DE.normalized + 1) %>% as.data.frame()
# Calculate relative log expression
medians <- apply(log_counts, 1, median) # Median for each gene across samples
rle <- sweep(log_counts, 1, medians, FUN = "-") # Subtract medians (log scale)
# Reshape data for ggplot
rle_long <- reshape2::melt(rle)
colnames(rle_long) <- c("Sample", "Deviation")
rle_long$Sample_group <- rep(c("HK", "WT"), c(1005,1005))
# Plot RLE as a boxplot colored by Sample_Group
ggplot(rle_long, aes(x = Sample, y = Deviation, fill = Sample_group)) +
geom_boxplot(outlier.size = 0.5, outlier.colour = "red") +
scale_fill_manual(values = c("forest green", "steel blue")) +
theme_minimal() +
labs(
title = "RLE Plot after Normalization",
x = "Sample",
y = "Relative Log Expression (Deviation)",
fill = "Sample_group"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity
Principal Component Analysis (PCA) is a dimensionality reduction technique widely used in data analysis, including RNA-Seq or other high-dimensional datasets. It transforms the data into a new coordinate system such that the greatest variance lies along the first principal component (PC1), the second greatest variance along the second component (PC2), and so on.
pca1<- pca(t(DE.normalized), center = T, scale = T) # Performing PCA
plotIndiv(pca1, group = colData$Sample_Group, ind.names = F,
title = "Total expression profile",
legend = T, legend.position = "bottom", col.per.group = c("forest green", "steel blue"))
DESeq2 performs differential expression gene (DEG) analysis using a model-based approach designed specifically for count data from RNA-Seq experiments. It employs generalized linear models (GLMs) and statistical techniques to identify genes with significant differences in expression across conditions.
DE.obj$Sample_Group <- relevel(DE.obj$Sample_Group, ref = "HK") # Setting reference group
DE.obj <- DESeq(DE.obj) #Fitting the model
res.con <- results(DE.obj, # Extracting result table
contrast=c("Sample_Group", "WT", "HK"), # Setting Contrast
pAdjustMethod = "BH", # Using 'BH' for p-value adjustment
alpha = 0.1) # Adjusted p-value threshold
res.con <- res.con[!is.na(res.con$padj),] # Removing genes with NA adj p-values
volcano <- as.data.frame(res.con) %>%
rownames_to_column("gene") %>%
mutate(Logpadj=-log10(padj)) %>%
mutate(sig=case_when(log2FoldChange >= 1 & padj < 0.05 ~ "Up",
log2FoldChange <= -1 & padj < 0.05 ~ "Down")) %>%
ggplot(aes(x=log2FoldChange, y=Logpadj, color=sig, label=gene)) +
geom_point(size=1) +
theme_bw() +
ylab("-Log10(FDR)") +
theme(legend.position = "none") +
scale_color_manual(values = c("firebrick", "steelblue")) +
geom_vline(xintercept = c(-1, 1), col = "gray", linetype = 'dashed') +
geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
ggtitle("Volcano DEGs")
ggplotly(volcano)